In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
In [5]:
comments = pd.read_csv(r'C:\Users\supri\Downloads\UScomments.csv',  on_bad_lines='skip')
C:\Users\supri\AppData\Local\Temp\ipykernel_25360\2911899788.py:1: DtypeWarning: Columns (2,3) have mixed types. Specify dtype option on import or set low_memory=False.
  comments = pd.read_csv(r'C:\Users\supri\Downloads\UScomments.csv',  on_bad_lines='skip')
In [6]:
comments.head()
Out[6]:
video_id comment_text likes replies
0 XpVt6Z1Gjjo Logan Paul it's yo big day ‼️‼️‼️ 4 0
1 XpVt6Z1Gjjo I've been following you from the start of your... 3 0
2 XpVt6Z1Gjjo Say hi to Kong and maverick for me 3 0
3 XpVt6Z1Gjjo MY FAN . attendance 3 0
4 XpVt6Z1Gjjo trending 😉 3 0
In [7]:
comments.isnull().sum()
Out[7]:
video_id         0
comment_text    26
likes            0
replies          0
dtype: int64
In [8]:
comments.dropna(inplace=True)
In [9]:
comments.isnull().sum()
Out[9]:
video_id        0
comment_text    0
likes           0
replies         0
dtype: int64

Performing Sentiment Analysis

In [10]:
!pip install textblob
Requirement already satisfied: textblob in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (0.18.0.post0)
Requirement already satisfied: nltk>=3.8 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from textblob) (3.8.1)
Requirement already satisfied: click in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from nltk>=3.8->textblob) (8.1.7)
Requirement already satisfied: joblib in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from nltk>=3.8->textblob) (1.4.0)
Requirement already satisfied: regex>=2021.8.3 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from nltk>=3.8->textblob) (2023.12.25)
Requirement already satisfied: tqdm in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from nltk>=3.8->textblob) (4.66.2)
Requirement already satisfied: colorama in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from click->nltk>=3.8->textblob) (0.4.6)
In [11]:
from textblob import TextBlob
In [12]:
comments.head(6)
Out[12]:
video_id comment_text likes replies
0 XpVt6Z1Gjjo Logan Paul it's yo big day ‼️‼️‼️ 4 0
1 XpVt6Z1Gjjo I've been following you from the start of your... 3 0
2 XpVt6Z1Gjjo Say hi to Kong and maverick for me 3 0
3 XpVt6Z1Gjjo MY FAN . attendance 3 0
4 XpVt6Z1Gjjo trending 😉 3 0
5 XpVt6Z1Gjjo #1 on trending AYYEEEEE 3 0
In [13]:
TextBlob("Logan Paul it's yo big day ‼️‼️‼️").sentiment.polarity
Out[13]:
0.0
In [14]:
comments.shape
Out[14]:
(691374, 4)
In [15]:
sample_df = comments[0:1000]
In [16]:
sample_df.shape
Out[16]:
(1000, 4)
In [ ]:
 
In [17]:
polarity = []

for comment in comments['comment_text']:
    try:
        polarity.append(TextBlob(comment).sentiment.polarity)
    except:
        polarity.append(0)
In [18]:
len(polarity)
Out[18]:
691374
In [19]:
comments['polarity'] = polarity
In [20]:
comments.head()
Out[20]:
video_id comment_text likes replies polarity
0 XpVt6Z1Gjjo Logan Paul it's yo big day ‼️‼️‼️ 4 0 0.0
1 XpVt6Z1Gjjo I've been following you from the start of your... 3 0 0.0
2 XpVt6Z1Gjjo Say hi to Kong and maverick for me 3 0 0.0
3 XpVt6Z1Gjjo MY FAN . attendance 3 0 0.0
4 XpVt6Z1Gjjo trending 😉 3 0 0.0
In [21]:
filter1  = comments['polarity'] == 1
In [22]:
comments_positive =comments[filter1]
In [23]:
comments_positive.head(5)
Out[23]:
video_id comment_text likes replies polarity
64 XpVt6Z1Gjjo yu are the best 1 0 1.0
156 cLdxuaxaQwc Power is the disease.  Care is the cure.  Keep... 0 0 1.0
227 WYYvHb03Eog YAS Can't wait to get it! I just need to sell ... 0 0 1.0
307 sjlHnJvXdQs This is priceless 0 0 1.0
319 sjlHnJvXdQs Summed up perfectly 0 0 1.0

Performing Wordcloud Analysis

In [24]:
!pip install wordcloud
Requirement already satisfied: wordcloud in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (1.9.3)
Requirement already satisfied: numpy>=1.6.1 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from wordcloud) (1.26.4)
Requirement already satisfied: pillow in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from wordcloud) (10.3.0)
Requirement already satisfied: matplotlib in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from wordcloud) (3.8.4)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (1.2.1)
Requirement already satisfied: cycler>=0.10 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (4.51.0)
Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (1.4.5)
Requirement already satisfied: packaging>=20.0 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (24.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (3.1.2)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (2.9.0.post0)
Requirement already satisfied: six>=1.5 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)
In [25]:
from wordcloud import WordCloud, STOPWORDS
In [26]:
set(STOPWORDS)
Out[26]:
{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'all',
 'also',
 'am',
 'an',
 'and',
 'any',
 'are',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 "can't",
 'cannot',
 'com',
 'could',
 "couldn't",
 'did',
 "didn't",
 'do',
 'does',
 "doesn't",
 'doing',
 "don't",
 'down',
 'during',
 'each',
 'else',
 'ever',
 'few',
 'for',
 'from',
 'further',
 'get',
 'had',
 "hadn't",
 'has',
 "hasn't",
 'have',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'hence',
 'her',
 'here',
 "here's",
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 "how's",
 'however',
 'http',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'k',
 "let's",
 'like',
 'me',
 'more',
 'most',
 "mustn't",
 'my',
 'myself',
 'no',
 'nor',
 'not',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'otherwise',
 'ought',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r',
 'same',
 'shall',
 "shan't",
 'she',
 "she'd",
 "she'll",
 "she's",
 'should',
 "shouldn't",
 'since',
 'so',
 'some',
 'such',
 'than',
 'that',
 "that's",
 'the',
 'their',
 'theirs',
 'them',
 'themselves',
 'then',
 'there',
 "there's",
 'therefore',
 'these',
 'they',
 "they'd",
 "they'll",
 "they're",
 "they've",
 'this',
 'those',
 'through',
 'to',
 'too',
 'under',
 'until',
 'up',
 'very',
 'was',
 "wasn't",
 'we',
 "we'd",
 "we'll",
 "we're",
 "we've",
 'were',
 "weren't",
 'what',
 "what's",
 'when',
 "when's",
 'where',
 "where's",
 'which',
 'while',
 'who',
 "who's",
 'whom',
 'why',
 "why's",
 'with',
 "won't",
 'would',
 "wouldn't",
 'www',
 'you',
 "you'd",
 "you'll",
 "you're",
 "you've",
 'your',
 'yours',
 'yourself',
 'yourselves'}
In [27]:
comments['comment_text']
Out[27]:
0                         Logan Paul it's yo big day ‼️‼️‼️
1         I've been following you from the start of your...
2                        Say hi to Kong and maverick for me
3                                       MY FAN . attendance
4                                                trending 😉
                                ...                        
691395                                               Лучшая
691396    qu'est ce que j'aimerais que tu viennes à Roan...
691397                            Ven a mexico! 😍 te amo LP
691398                                      Islığı yeter...
691399    Kocham tą piosenkę😍❤❤❤byłam zakochana po uszy ...
Name: comment_text, Length: 691374, dtype: object
In [28]:
type(comments['comment_text'])
Out[28]:
pandas.core.series.Series
In [29]:
total_comments_positive = ' '.join(comments_positive['comment_text'])
In [30]:
wordcloud_positive = WordCloud(stopwords = set(STOPWORDS)).generate(total_comments_positive)
In [31]:
plt.imshow(wordcloud_positive)
plt.axis('off')
Out[31]:
(-0.5, 399.5, 199.5, -0.5)
No description has been provided for this image
In [32]:
filter2 = comments['polarity'] == -1
In [33]:
comments_negative = comments[filter2]
In [34]:
comments_negative.head(5)
Out[34]:
video_id comment_text likes replies polarity
512 8wNr-NQImFg BEN CARSON IS THE MAN!!!!! THEY HATE HIM CAUSE... 0 0 -1.0
562 8wNr-NQImFg Well… The brain surgeon Ben Carson just proved... 0 0 -1.0
952 Ayb_2qbZHm4 WHY DID YOU MAKE FURRY FORCE?! SO NASTY!!! 0 0 -1.0
1371 vu_9muoxT50 WTF BRUH!!!!!! 0 0 -1.0
1391 vu_9muoxT50 cheeseus christ thats insane!!! 0 0 -1.0
In [35]:
total_comments_negative = ' '.join(comments_negative['comment_text'])
In [36]:
wordcloud_negative = WordCloud(stopwords = set(STOPWORDS)).generate(total_comments_negative)
In [37]:
plt.imshow(wordcloud_negative)
plt.axis('off')
Out[37]:
(-0.5, 399.5, 199.5, -0.5)
No description has been provided for this image

Perform Emoji Analysis

In [38]:
!pip install emoji==2.2.0
Requirement already satisfied: emoji==2.2.0 in c:\users\supri\appdata\local\programs\python\python312\lib\site-packages (2.2.0)
In [39]:
import emoji
In [40]:
emoji.__version__
Out[40]:
'2.2.0'
In [41]:
comments['comment_text'].head(5)
Out[41]:
0                    Logan Paul it's yo big day ‼️‼️‼️
1    I've been following you from the start of your...
2                   Say hi to Kong and maverick for me
3                                  MY FAN . attendance
4                                           trending 😉
Name: comment_text, dtype: object
In [42]:
comment = 'trending 😉'
In [43]:
[char for char in comment if char in emoji.EMOJI_DATA]
Out[43]:
['😉']
In [44]:
emoji_list = []
for comment in comments['comment_text']:
    for char in comment:
        if char in emoji.EMOJI_DATA:
            emoji_list.append(char)
In [45]:
emoji_list[0:10]
Out[45]:
['‼', '‼', '‼', '😉', '😭', '👍', '🏻', '❤', '😍', '💋']
In [46]:
from collections import Counter
In [47]:
Counter(emoji_list).most_common(10)
Out[47]:
[('😂', 36987),
 ('😍', 33453),
 ('❤', 31119),
 ('🔥', 8694),
 ('😭', 8398),
 ('👏', 5719),
 ('😘', 5545),
 ('👍', 5476),
 ('💖', 5359),
 ('💕', 5147)]
In [48]:
emojis = [Counter(emoji_list).most_common(10)[i][0]for i in range(10)]
In [49]:
emojis
Out[49]:
['😂', '😍', '❤', '🔥', '😭', '👏', '😘', '👍', '💖', '💕']
In [50]:
frequencies = [Counter(emoji_list).most_common(10)[i][1]for i in range(10)]
In [51]:
frequencies
Out[51]:
[36987, 33453, 31119, 8694, 8398, 5719, 5545, 5476, 5359, 5147]
In [52]:
import plotly.graph_objs as go
from plotly.offline import iplot
In [53]:
trace = go.Bar(x=emojis, y=frequencies)
In [54]:
iplot([trace])

Collect Entire Data of Youtube: Data Collection

In [55]:
import os
In [63]:
files = os.listdir(r'D:\youtube\additional_data')
In [64]:
files
Out[64]:
['CAvideos.csv',
 'CA_category_id.json',
 'DEvideos.csv',
 'DE_category_id.json',
 'FRvideos.csv',
 'FR_category_id.json',
 'GBvideos.csv',
 'GB_category_id.json',
 'INvideos.csv',
 'IN_category_id.json',
 'JPvideos.csv',
 'JP_category_id.json',
 'KRvideos.csv',
 'KR_category_id.json',
 'MXvideos.csv',
 'MX_category_id.json',
 'RUvideos.csv',
 'RU_category_id.json',
 'USvideos.csv',
 'US_category_id.json']
In [65]:
files_csv = [file for file in files if '.csv' in file]
In [66]:
files_csv
Out[66]:
['CAvideos.csv',
 'DEvideos.csv',
 'FRvideos.csv',
 'GBvideos.csv',
 'INvideos.csv',
 'JPvideos.csv',
 'KRvideos.csv',
 'MXvideos.csv',
 'RUvideos.csv',
 'USvideos.csv']
In [67]:
import warnings 
from warnings import filterwarnings
filterwarnings('ignore')
In [75]:
full_df = pd.DataFrame()
path = r'D:\youtube\additional_data'

for file in files_csv:
    current_df = pd.read_csv(path+'/'+file , encoding='iso-8859-1', on_bad_lines='skip')
    full_df = pd.concat([full_df, current_df] , ignore_index=True)
In [76]:
full_df.shape
Out[76]:
(375942, 16)
In [78]:
full_df[full_df.duplicated()].shape
Out[78]:
(36417, 16)
In [80]:
full_df = full_df.drop_duplicates()
In [82]:
full_df.shape
Out[82]:
(339525, 16)

How to export data into csv, json, database etc.

In [87]:
full_df[0:1000].to_csv(r'D:\youtube\Export_data/youtube_sample.csv', index=False)
In [89]:
full_df[0:1000].to_json(r'D:\youtube\Export_data/youtube_sample.json')
In [91]:
from sqlalchemy import create_engine
In [95]:
engine = create_engine(r'sqlite:///D:\youtube\Export_data/youtube_sample.sqlite')
In [96]:
full_df[0:1000].to_sql('User',con = engine,if_exists = 'append')
Out[96]:
1000

Analysing the most liked category!

In [97]:
full_df.head()
Out[97]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes comment_count thumbnail_link comments_disabled ratings_disabled video_error_or_removed description
0 n1WpP7iowLc 17.14.11 Eminem - Walk On Water (Audio) ft. Beyoncé EminemVEVO 10 2017-11-10T17:00:03.000Z Eminem|"Walk"|"On"|"Water"|"Aftermath/Shady/In... 17158579 787425 43420 125882 https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg False False False Eminem's new track Walk on Water ft. Beyoncé ...
1 0dBIkQ4Mz1M 17.14.11 PLUSH - Bad Unboxing Fan Mail iDubbbzTV 23 2017-11-13T17:00:00.000Z plush|"bad unboxing"|"unboxing"|"fan mail"|"id... 1014651 127794 1688 13030 https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg False False False STill got a lot of packages. Probably will las...
2 5qpjK5DgCt4 17.14.11 Racist Superman | Rudy Mancuso, King Bach & Le... Rudy Mancuso 23 2017-11-12T19:05:24.000Z racist superman|"rudy"|"mancuso"|"king"|"bach"... 3191434 146035 5339 8181 https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg False False False WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► ...
3 d380meD0W0M 17.14.11 I Dare You: GOING BALD!? nigahiga 24 2017-11-12T18:01:41.000Z ryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"... 2095828 132239 1989 17518 https://i.ytimg.com/vi/d380meD0W0M/default.jpg False False False I know it's been a while since we did this sho...
4 2Vv-BfVoq4g 17.14.11 Ed Sheeran - Perfect (Official Music Video) Ed Sheeran 10 2017-11-09T11:04:14.000Z edsheeran|"ed sheeran"|"acoustic"|"live"|"cove... 33523622 1634130 21082 85067 https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg False False False 🎧: https://ad.gt/yt-perfect\n💰: https://...
In [98]:
full_df['category_id'].unique()
Out[98]:
array([10, 23, 24, 25, 22, 26,  1, 28, 20, 17, 29, 15, 19,  2, 27, 43, 30,
       44], dtype=int64)
In [105]:
json_df = pd.read_json(r'D:\youtube\additional_data\US_category_id.json')
In [106]:
json_df
Out[106]:
kind etag items
0 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
1 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
2 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
3 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
4 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
5 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
6 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
7 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
8 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
9 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
10 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
11 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
12 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
13 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
14 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
15 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
16 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
17 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
18 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
19 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
20 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
21 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
22 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
23 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
24 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
25 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
26 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
27 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
28 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
29 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
30 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
31 youtube#videoCategoryListResponse "m2yskBQFythfE4irbTIeOgYYfBU/S730Ilt-Fi-emsQJv... {'kind': 'youtube#videoCategory', 'etag': '"m2...
In [107]:
json_df['items']
Out[107]:
0     {'kind': 'youtube#videoCategory', 'etag': '"m2...
1     {'kind': 'youtube#videoCategory', 'etag': '"m2...
2     {'kind': 'youtube#videoCategory', 'etag': '"m2...
3     {'kind': 'youtube#videoCategory', 'etag': '"m2...
4     {'kind': 'youtube#videoCategory', 'etag': '"m2...
5     {'kind': 'youtube#videoCategory', 'etag': '"m2...
6     {'kind': 'youtube#videoCategory', 'etag': '"m2...
7     {'kind': 'youtube#videoCategory', 'etag': '"m2...
8     {'kind': 'youtube#videoCategory', 'etag': '"m2...
9     {'kind': 'youtube#videoCategory', 'etag': '"m2...
10    {'kind': 'youtube#videoCategory', 'etag': '"m2...
11    {'kind': 'youtube#videoCategory', 'etag': '"m2...
12    {'kind': 'youtube#videoCategory', 'etag': '"m2...
13    {'kind': 'youtube#videoCategory', 'etag': '"m2...
14    {'kind': 'youtube#videoCategory', 'etag': '"m2...
15    {'kind': 'youtube#videoCategory', 'etag': '"m2...
16    {'kind': 'youtube#videoCategory', 'etag': '"m2...
17    {'kind': 'youtube#videoCategory', 'etag': '"m2...
18    {'kind': 'youtube#videoCategory', 'etag': '"m2...
19    {'kind': 'youtube#videoCategory', 'etag': '"m2...
20    {'kind': 'youtube#videoCategory', 'etag': '"m2...
21    {'kind': 'youtube#videoCategory', 'etag': '"m2...
22    {'kind': 'youtube#videoCategory', 'etag': '"m2...
23    {'kind': 'youtube#videoCategory', 'etag': '"m2...
24    {'kind': 'youtube#videoCategory', 'etag': '"m2...
25    {'kind': 'youtube#videoCategory', 'etag': '"m2...
26    {'kind': 'youtube#videoCategory', 'etag': '"m2...
27    {'kind': 'youtube#videoCategory', 'etag': '"m2...
28    {'kind': 'youtube#videoCategory', 'etag': '"m2...
29    {'kind': 'youtube#videoCategory', 'etag': '"m2...
30    {'kind': 'youtube#videoCategory', 'etag': '"m2...
31    {'kind': 'youtube#videoCategory', 'etag': '"m2...
Name: items, dtype: object
In [109]:
json_df['items'][0]
Out[109]:
{'kind': 'youtube#videoCategory',
 'etag': '"m2yskBQFythfE4irbTIeOgYYfBU/Xy1mB4_yLrHy_BmKmPBggty2mZQ"',
 'id': '1',
 'snippet': {'channelId': 'UCBR8-60-B28hp2BmDPdntcQ',
  'title': 'Film & Animation',
  'assignable': True}}
In [108]:
cat_dict = {}

for item in json_df['items'].values:
    cat_dict[int(item['id'])] =item['snippet']['title']
In [110]:
cat_dict
Out[110]:
{1: 'Film & Animation',
 2: 'Autos & Vehicles',
 10: 'Music',
 15: 'Pets & Animals',
 17: 'Sports',
 18: 'Short Movies',
 19: 'Travel & Events',
 20: 'Gaming',
 21: 'Videoblogging',
 22: 'People & Blogs',
 23: 'Comedy',
 24: 'Entertainment',
 25: 'News & Politics',
 26: 'Howto & Style',
 27: 'Education',
 28: 'Science & Technology',
 29: 'Nonprofits & Activism',
 30: 'Movies',
 31: 'Anime/Animation',
 32: 'Action/Adventure',
 33: 'Classics',
 34: 'Comedy',
 35: 'Documentary',
 36: 'Drama',
 37: 'Family',
 38: 'Foreign',
 39: 'Horror',
 40: 'Sci-Fi/Fantasy',
 41: 'Thriller',
 42: 'Shorts',
 43: 'Shows',
 44: 'Trailers'}
In [111]:
full_df['category_name'] = full_df['category_id'].map(cat_dict)
In [112]:
full_df.head()
Out[112]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes comment_count thumbnail_link comments_disabled ratings_disabled video_error_or_removed description category_name
0 n1WpP7iowLc 17.14.11 Eminem - Walk On Water (Audio) ft. Beyoncé EminemVEVO 10 2017-11-10T17:00:03.000Z Eminem|"Walk"|"On"|"Water"|"Aftermath/Shady/In... 17158579 787425 43420 125882 https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg False False False Eminem's new track Walk on Water ft. Beyoncé ... Music
1 0dBIkQ4Mz1M 17.14.11 PLUSH - Bad Unboxing Fan Mail iDubbbzTV 23 2017-11-13T17:00:00.000Z plush|"bad unboxing"|"unboxing"|"fan mail"|"id... 1014651 127794 1688 13030 https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg False False False STill got a lot of packages. Probably will las... Comedy
2 5qpjK5DgCt4 17.14.11 Racist Superman | Rudy Mancuso, King Bach & Le... Rudy Mancuso 23 2017-11-12T19:05:24.000Z racist superman|"rudy"|"mancuso"|"king"|"bach"... 3191434 146035 5339 8181 https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg False False False WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► ... Comedy
3 d380meD0W0M 17.14.11 I Dare You: GOING BALD!? nigahiga 24 2017-11-12T18:01:41.000Z ryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"... 2095828 132239 1989 17518 https://i.ytimg.com/vi/d380meD0W0M/default.jpg False False False I know it's been a while since we did this sho... Entertainment
4 2Vv-BfVoq4g 17.14.11 Ed Sheeran - Perfect (Official Music Video) Ed Sheeran 10 2017-11-09T11:04:14.000Z edsheeran|"ed sheeran"|"acoustic"|"live"|"cove... 33523622 1634130 21082 85067 https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg False False False 🎧: https://ad.gt/yt-perfect\n💰: https://... Music
In [119]:
plt.figure(figsize=(12,8))
sns.boxplot(x='category_name' ,  y = 'likes', data = full_df)
plt.xticks(rotation='vertical')
Out[119]:
([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
 [Text(0, 0, 'Music'),
  Text(1, 0, 'Comedy'),
  Text(2, 0, 'Entertainment'),
  Text(3, 0, 'News & Politics'),
  Text(4, 0, 'People & Blogs'),
  Text(5, 0, 'Howto & Style'),
  Text(6, 0, 'Film & Animation'),
  Text(7, 0, 'Science & Technology'),
  Text(8, 0, 'Gaming'),
  Text(9, 0, 'Sports'),
  Text(10, 0, 'Nonprofits & Activism'),
  Text(11, 0, 'Pets & Animals'),
  Text(12, 0, 'Travel & Events'),
  Text(13, 0, 'Autos & Vehicles'),
  Text(14, 0, 'Education'),
  Text(15, 0, 'Shows'),
  Text(16, 0, 'Movies'),
  Text(17, 0, 'Trailers')])
No description has been provided for this image

Analyse whether the audience is engaged or not!

In [121]:
full_df['like_rate']=(full_df['likes']/full_df['views'])*100
full_df['dislike_rate']=(full_df['dislikes']/full_df['views'])*100
full_df['comment_count_rate']=(full_df['comment_count']/full_df['views'])*100
In [122]:
full_df['like_rate']
Out[122]:
0          4.589104
1         12.594873
2          4.575843
3          6.309630
4          4.874563
            ...    
375936     7.820293
375938     5.635623
375939     4.507286
375940     3.408645
375941     3.464728
Name: like_rate, Length: 339525, dtype: float64
In [123]:
full_df['dislike_rate']
Out[123]:
0         0.253051
1         0.166363
2         0.167292
3         0.094903
4         0.062887
            ...   
375936    0.049061
375938    0.035875
375939    0.096770
375940    0.050275
375941    2.066500
Name: dislike_rate, Length: 339525, dtype: float64
In [124]:
full_df['comment_count_rate']
Out[124]:
0         0.733639
1         1.284185
2         0.256342
3         0.835851
4         0.253752
            ...   
375936    0.758070
375938    0.369648
375939    0.374326
375940    0.231204
375941    1.404942
Name: comment_count_rate, Length: 339525, dtype: float64
In [125]:
full_df.columns
Out[125]:
Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
       'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
       'thumbnail_link', 'comments_disabled', 'ratings_disabled',
       'video_error_or_removed', 'description', 'category_name', 'like_rate',
       'dislike_rate', 'comment_count_rate'],
      dtype='object')
In [128]:
plt.figure(figsize=(8,6))
sns.boxplot(x='category_name' ,  y = 'like_rate', data = full_df)
plt.xticks(rotation='vertical')
plt.show()
No description has been provided for this image
In [129]:
sns.regplot(x='views', y ='likes', data = full_df)
Out[129]:
<Axes: xlabel='views', ylabel='likes'>
No description has been provided for this image
In [131]:
full_df[['views', 'likes', 'dislikes']].corr()
Out[131]:
views likes dislikes
views 1.000000 0.779531 0.405428
likes 0.779531 1.000000 0.451809
dislikes 0.405428 0.451809 1.000000
In [132]:
sns.heatmap(full_df[['views', 'likes', 'dislikes']].corr(), annot = True)
Out[132]:
<Axes: >
No description has been provided for this image

Trending Videos on YouTube

In [133]:
full_df.head()
Out[133]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes comment_count thumbnail_link comments_disabled ratings_disabled video_error_or_removed description category_name like_rate dislike_rate comment_count_rate
0 n1WpP7iowLc 17.14.11 Eminem - Walk On Water (Audio) ft. Beyoncé EminemVEVO 10 2017-11-10T17:00:03.000Z Eminem|"Walk"|"On"|"Water"|"Aftermath/Shady/In... 17158579 787425 43420 125882 https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg False False False Eminem's new track Walk on Water ft. Beyoncé ... Music 4.589104 0.253051 0.733639
1 0dBIkQ4Mz1M 17.14.11 PLUSH - Bad Unboxing Fan Mail iDubbbzTV 23 2017-11-13T17:00:00.000Z plush|"bad unboxing"|"unboxing"|"fan mail"|"id... 1014651 127794 1688 13030 https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg False False False STill got a lot of packages. Probably will las... Comedy 12.594873 0.166363 1.284185
2 5qpjK5DgCt4 17.14.11 Racist Superman | Rudy Mancuso, King Bach & Le... Rudy Mancuso 23 2017-11-12T19:05:24.000Z racist superman|"rudy"|"mancuso"|"king"|"bach"... 3191434 146035 5339 8181 https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg False False False WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► ... Comedy 4.575843 0.167292 0.256342
3 d380meD0W0M 17.14.11 I Dare You: GOING BALD!? nigahiga 24 2017-11-12T18:01:41.000Z ryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"... 2095828 132239 1989 17518 https://i.ytimg.com/vi/d380meD0W0M/default.jpg False False False I know it's been a while since we did this sho... Entertainment 6.309630 0.094903 0.835851
4 2Vv-BfVoq4g 17.14.11 Ed Sheeran - Perfect (Official Music Video) Ed Sheeran 10 2017-11-09T11:04:14.000Z edsheeran|"ed sheeran"|"acoustic"|"live"|"cove... 33523622 1634130 21082 85067 https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg False False False 🎧: https://ad.gt/yt-perfect\n💰: https://... Music 4.874563 0.062887 0.253752
In [135]:
full_df['channel_title'].value_counts()
Out[135]:
channel_title
The Late Show with Stephen Colbert    710
WWE                                   643
Late Night with Seth Meyers           592
TheEllenShow                          555
Jimmy Kimmel Live                     528
                                     ... 
Daas                                    1
YT Industries                           1
BTLV Le média complémentaire          1
Quem Sabia ?                            1
Jessi Osorno                            1
Name: count, Length: 37824, dtype: int64
In [139]:
cdf = full_df.groupby(['channel_title']).size().sort_values(ascending=False).reset_index()
In [140]:
cdf = cdf.rename(columns = {0:'total_videos'})
In [141]:
cdf
Out[141]:
channel_title total_videos
0 The Late Show with Stephen Colbert 710
1 WWE 643
2 Late Night with Seth Meyers 592
3 TheEllenShow 555
4 Jimmy Kimmel Live 528
... ... ...
37819 Kd Malts 1
37820 Zedan TV 1
37821 Kc Kelly - Rocketprenuer 1
37822 Kbaby 1
37823 Pavel Sidorik TV 1

37824 rows × 2 columns

In [142]:
import plotly.express as px
In [146]:
px.bar(data_frame = cdf[0:20], x = 'channel_title', y='total_videos')

Does punctuations have impact on views , likes, dislikes?

In [147]:
full_df['title'][0]
Out[147]:
'Eminem - Walk On Water (Audio) ft. Beyoncé'
In [148]:
import string 
In [150]:
string.punctuation
Out[150]:
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
In [152]:
len([char for char in full_df['title'][0] if char in string.punctuation])
Out[152]:
4
In [163]:
def punc_count(text):
    return len([char for char in text if char in string.punctuation])
In [168]:
sample = full_df[0:10000]
In [169]:
sample['count_punc'] = sample['title'].apply(punc_count)
In [170]:
sample['count_punc']
Out[170]:
0       4
1       1
2       3
3       3
4       3
       ..
9995    6
9996    0
9997    1
9998    0
9999    6
Name: count_punc, Length: 10000, dtype: int64
In [171]:
plt.figure(figsize=(8,6))
sns.boxplot(x='count_punc' ,  y = 'likes', data = sample)
plt.show()
No description has been provided for this image
In [172]:
plt.figure(figsize=(8,6))
sns.boxplot(x='count_punc' ,  y = 'views', data = sample)
plt.show()
No description has been provided for this image
In [174]:
plt.figure(figsize=(8,6))
sns.boxplot(x='count_punc' ,  y = 'dislikes', data = sample)
plt.show()
No description has been provided for this image
In [ ]: